#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
suppressWarnings(suppressMessages(library(WGCNA)))
library(expss)
library(polycor)
library(foreach) ; library(doParallel)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in 20_02_21_data_preprocessing.Rmd) and clustering (pipeline in 20_02_24_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  left_join(GO_neuronal, by='ID') %>% left_join(clusterings, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), 
         significant=padj<0.05 & !is.na(padj))


rm(DE_info, GO_annotations, clusterings)

Dynamic Tree vs Dyamic Hybrid

print(paste0('Dynamic Tree leaves ', sum(genes_info$DynamicTree=='gray'), ' genes without cluster (', 
             round(mean(genes_info$DynamicTree=='gray')*100), '%)'))
## [1] "Dynamic Tree leaves 433 genes without cluster (3%)"
print(paste0('Dynamic Hybrid leaves ', sum(genes_info$DynamicHybrid=='gray'), ' genes without cluster (', 
             round(mean(genes_info$DynamicHybrid=='gray')*100,2), '%)'))
## [1] "Dynamic Hybrid leaves 229 genes without cluster (1.42%)"

Dynamic Tree leaves more genes without a cluster, but in previous experiments it returned cleaner results, so I’m going to see which genes are lost to see how big the damage is.

When studying all the brian regions together, there seemed to be a relation between DE and module membership, being DE a more restrictive condition than being assigned to a cluster. Here it’s not that easy to distinguish given the small amount of DE genes and genes left without a cluster, but they could still be related since genes without a cluser have a very small PC2 and DE genes very high absolute PC2

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% mutate('hasCluster'=DynamicTree!='gray', 
                                                      'hasSFARIScore'=`gene-score`!='None') %>%
            apply_labels(`gene-score`='SFARI Gene score', DynamicTree = 'Dynamic Tree Algorithm', 
                         significant = 'Differentially Expressed', hasCluster = 'Belongs to a Module',
                         hasSFARIScore = 'Has a SFARI Score', syndromic = 'Has syndromic tag')

p1 = plot_data %>% ggplot(aes(PC1, PC2, color=hasCluster)) + geom_point(alpha=1-0.9*plot_data$hasCluster) + 
  theme_minimal() + ggtitle('Genes are assigned to a cluster') + theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=significant)) + geom_point(alpha=plot_data$significant+0.1) + 
  theme_minimal() + ggtitle('Genes found to be DE') + theme(legend.position='bottom')

grid.arrange(p1, p2, nrow=1)

rm(pca, p1, p2)

Most of the genes that don’t have a cluster are not differentially expressed, but almost all genes are not differentially expressed, so this isn’t that remarkable

cat(paste0(round(100*sum(!plot_data$hasCluster & !plot_data$significant)/sum(!plot_data$hasCluster)),
           '% of the genes that don\'t have a cluster are not differentially expressed\n'))
## 99% of the genes that don't have a cluster are not differentially expressed
cro(plot_data$significant, list(plot_data$hasCluster, total()))
 Belongs to a Module     #Total 
 FALSE   TRUE   
 Differentially Expressed 
   FALSE  430 15700   16130
   TRUE  3 21   24
   #Total cases  433 15721   16154

Most of the genes with a SFARI score are assigned to a cluster

cat(paste0(sum(plot_data$hasSFARIScore & !plot_data$hasCluster), ' of the SFARI genes (~',
           round(100*sum(plot_data$hasSFARIScore & !plot_data$hasCluster)/sum(plot_data$hasSFARIScore)),
           '%) are not assigned to any cluster\n'))
## 15 of the SFARI genes (~2%) are not assigned to any cluster
cro(plot_data$hasSFARIScore, list(plot_data$hasCluster, total()))
 Belongs to a Module     #Total 
 FALSE   TRUE   
 Has a SFARI Score 
   FALSE  418 14835   15253
   TRUE  15 886   901
   #Total cases  433 15721   16154

Conclusion:

The main ndifference between algorithms is that Dynamic Hybrid clusters outlier genes and Dynamic Tree leaves them out, so Dynamic Tree would give me a ‘cleaner’ group of genes to work with, without losing many SFARI genes, but Dynamic Hybrid has less and more balanced clusters

I think both options could be feasible, but I’m going to use the Dynamic Hybrid algorithm to keep more genes

Since Dynamic Hybrid returned so many modules, I’m going to use the smallest of the merged modules, DynamicHybridMergedSmall

clustering_selected = 'DynamicHybridMergedSmall'
genes_info$Module = genes_info[,clustering_selected]

Dynamic Hybrid Modules

*The colour of the modules is the arbitrary one assigned during the WGCNA algorithm, where the gray cluster actually represents all the genes that were left without a cluster (so it’s not actually a cluster).

cat(paste0('The Dynamic Hybrid algorithm created ', length(unique(genes_info$Module))-1, ' modules and leaves ',
           sum(genes_info$Module=='gray'), ' genes without a module.\n'))
## The Dynamic Hybrid algorithm created 149 modules and leaves 229 genes without a module.
table(genes_info$Module)
## 
## #00A6FF #00A8FF #00AAFE #00ABFC #00ADFA #00AFF8 #00B1F5 #00B2F3 #00B4F0 
##     105      14      33      66     111      78     174     558      33 
## #00B5ED #00B6EA #00B7E7 #00B811 #00B825 #00B931 #00B9E4 #00BA3C #00BAE1 
##      21      75      17      24      15     248      28      30      28 
## #00BB45 #00BB4D #00BBDA #00BBDD #00BC54 #00BCD6 #00BD5B #00BD62 #00BDD2 
##      80     224      66      45      16      25      20      19      42 
## #00BE68 #00BE6E #00BECA #00BECE #00BF74 #00BF7A #00BF80 #00BFC2 #00BFC6 
##      13      14      27     113      24     974      16      16     378 
## #00C085 #00C08A #00C08F #00C095 #00C0B1 #00C0B5 #00C0BA #00C0BE #00C199 
##     690      47      55      17      27     237      26      81      25 
## #00C19E #00C1A3 #00C1A8 #00C1AC #17A3FF #21B700 #35B600 #39A1FF #43B500 
##      19      16     189      12      45      84      27      22     107 
## #4D9FFF #4EB400 #58B300 #5C9DFF #61B200 #69B100 #6A9AFF #70B000 #7598FF 
##      69      28     172      37      42     162     213     222      79 
## #77AF00 #7DAE00 #8095FF #83AD00 #8993FF #89AC00 #8EAB00 #9290FF #94A900 
##      39     149     257      59      38      19      35      37      66 
## #98A800 #9A8EFF #9DA700 #A28BFF #A2A600 #A6A400 #AA88FF #ABA300 #AFA200 
##      14      77      60     222      39      34      56      24    1418 
## #B086FF #B3A000 #B783FF #B79F00 #BA9E00 #BD81FF #BE9C00 #C19B00 #C37EFF 
##     107      69      38      84      44      39      25      54      69 
## #C59900 #C87BFF #C89800 #CB9600 #CD79FF #CF9500 #D277FF #D29300 #D59100 
##      37      32     218      27      23      19      26      14      27 
## #D774FD #D79000 #DA8E00 #DB72FB #DD8D00 #DF70F9 #E08B00 #E28900 #E36EF6 
##      45     102     315     715     298     105     223     208      73 
## #E48800 #E66CF3 #E7861A #E9842A #EA6AF1 #EB8236 #ED68EE #ED8140 #EF7F49 
##     313      31      28      57      42      30      42     117      40 
## #F067EB #F17D51 #F265E7 #F37B59 #F464E4 #F57A60 #F67866 #F763E1 #F862DD 
##      18      23      52      38      23     229      17      97      35 
## #F8766D #F97573 #FA62D9 #FB7379 #FC61D5 #FC717F #FD61D1 #FD7085 #FE61CD 
##     138     221      25      38      47     190      70      66      20 
## #FE6E8A #FF61C1 #FF61C5 #FF61C9 #FF62BC #FF63B8 #FF64B3 #FF65AE #FF66A9 
##      32      13      38      50     136      24      86      15     853 
## #FF67A5 #FF68A0 #FF699A #FF6B95 #FF6C90    gray 
##      22      44      71     255      40     229
plot_data = table(genes_info$Module) %>% data.frame %>% arrange(desc(Freq))

ggplotly(plot_data %>% ggplot(aes(x=reorder(Var1, -Freq), y=Freq)) + geom_bar(stat='identity', fill=plot_data$Var1) + 
         ggtitle('Module size') + ylab('Number of genes') + xlab('Module') + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90)))


Relation to external clinical traits

Quantifying module-trait associations

In the WGCNA documentation they use Pearson correlation to calculate correlations, I think all of their variables were continuous. Since I have categorical variables I’m going to use the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

  • I’m not sure how the corPvalueStudent function calculates the p-values and I cannot find any documentation…

  • Compared correlations using Pearson correlation and with hetcor and they are very similar, but a bit more extreme with hetcor. The same thing happens with the p-values.

datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age, PMI, RNAExtractionBatch) %>%
            rename('ExtractionBatch' = RNAExtractionBatch)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}
## [1] "2 correlation(s) could not be calculated"
rm(ME_object)

Note: The correlation between Module #E08B00 and Diagonsis is the one that cannot be calculated, weirdly enough, the thing that causes the error is that the initial correlation is too high, so it would be a very bad thing to lose this module because of this numerical error. I’m going to fill in its value using the polyserial function, which doesn’t give exactly the same results as the hetcor() function, but it’s quite similar.

# Calculate the correlation tha failed with hetcor()
moduleTraitCor['ME#E08B00','Diagnosis'] = polyserial(MEs[,'ME#E08B00'], datTraits$Diagnosis)
## Warning in polyserial(MEs[, "ME#E08B00"], datTraits$Diagnosis): initial
## correlation inadmissible, 1.07478833176392, set to 0.9999

Modules have very strong correlations with Diagnosis with really small p-values and not much relation with anything else. Perhaps a little with PMI and Brain Region.

They gray ‘module’ no longer has one of the lowest correlations with Diagnosis

# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Filter some modules so the heatmap is easier to plot
# moduleTraitCor = moduleTraitCor[abs(moduleTraitCor[,1])>0.3,]
# moduleTraitPvalue = moduleTraitPvalue[abs(moduleTraitCor[,1])>0.3,]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, cex.lab.y = 0.75, zlim = c(-1,1),
               main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')

rm(moduleTraitCor, moduleTraitPvalue, textMatrix, diagnosis_cor)

It’s harder to see with so little DE genes, but this plot still shows that Modules with a high Module-Diagnosis (absolute) correlation have a higher content of differentially expressed genes

plot_data = genes_info %>% group_by(Module, MTcor) %>% summarise(p = 100*mean(significant))

plot_data %>% ggplot(aes(MTcor, p)) + geom_hline(yintercept=mean(plot_data$p), color='gray', linetype='dotted') +
         geom_point(color=plot_data$Module, aes(id=Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Percentage of differentially expressed genes')


Gene Significance and Module Membership

Gene significance: is the value between the correlation between the gene and the trait we are interested in. A positive gene significance means the gene is overexpressed and a negative value means its underexpressed. (The term ‘significance’ is not very acurate because it’s not actually measuring statistical significance, it’s just a correlation, but that’s how they call it in WGCNA…)

Module Membership is the correlation of the module’s eigengene and the expression profile of a gene. The higher the Module Membership, the more similar the gene is to the genes that constitute the module. (I won’t use this measure yet)

# It's more efficient to iterate the correlations one by one, otherwise it calculates correlations between the eigengenes and also between the genes, which we don't need

# Check if MM information already exists and if not, calculate it
if(file.exists(paste0('./../Data/dataset_', clustering_selected, '.csv'))){
  
  dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
  dataset$Module = dataset[,clustering_selected]
  
} else {
  
  ############# 1. Calculate Gene Significance
  GS_info = data.frame('ID' = rownames(datExpr),
                       'GS' = datExpr %>% apply(1, function(x) hetcor(x, datMeta$Diagnosis)$correlations[1,2])) %>%
            mutate('GSpval' = corPvalueStudent(GS, ncol(datExpr)))
  
  #############  2. Calculate Module Membership
  
  #setup parallel backend to use many processors
  cores = detectCores()
  cl = makeCluster(cores-1)
  registerDoParallel(cl)
  
  # Create matrix with MM by gene
  MM = foreach(i=1:nrow(datExpr), .combine=rbind) %dopar% {
    library(polycor)
    tempMatrix = apply(MEs, 2, function(x) hetcor(as.numeric(datExpr[i,]), x)$correlations[1,2])
    tempMatrix
  }
  
  # Stop clusters
  stopCluster(cl)
  
  rownames(MM) = rownames(datExpr)
  colnames(MM) = paste0('MM',gsub('ME','',colnames(MEs)))
  
  # Calculate p-values
  MMpval = MM %>% corPvalueStudent(ncol(datExpr)) %>% as.data.frame
  colnames(MMpval) = paste0('MMpval', gsub('ME','',colnames(MEs)))
  
  MM = MM %>% as.data.frame %>% mutate(ID = rownames(.))
  MMpval = MMpval %>% as.data.frame %>% mutate(ID = rownames(.))
  
  # Join and save results
  dataset = genes_info %>% dplyr::select(ID, `gene-score`, clustering_selected, MTcor, MTpval) %>%
            left_join(GS_info, by='ID') %>%
            left_join(MM, by='ID') %>%
            left_join(MMpval, by='ID')
  
  write.csv(dataset, file = paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names = FALSE)
  
  rm(cores, cl) 
  
}

GS_missing = dataset$ID[is.na(dataset$GS)] %>% as.character

if(length(GS_missing)>0){
  
  print(paste0(length(GS_missing),' correlations between genes and Diagnosis could not be calculated, ',
               'calculating them with the polyserial function'))
  
  for(g in GS_missing){
    dataset$GS[dataset$ID == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
  
}
## [1] "36 correlations between genes and Diagnosis could not be calculated, calculating them with the polyserial function"
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00086601586701, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.01005534381103, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.05225322768285, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.00022572924371, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.00057281088148, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00517501596365, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.06171755385438, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.01212301373417, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00633029247751, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.01472473316713, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.0190691339179, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00130194444989, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.02454488167405, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00736735929184, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00631216725183, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00150362151356, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.01289562715944, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.01089055195498, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00640240468909, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.00446948637747, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.01029028386243, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.07769738434974, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00401974279988, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, 1.01591017247702, set to 0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00556806651229, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.03825753999499, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00814873439015, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.03097451577053, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.01283411453025, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.0954076788885, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.06630750224835, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.0414268492739, set to -0.9999
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.00676966148363, set to -0.9999
rm(GS_missing)


Analysing concordance between these metrics in the genes


1. Gene Significance vs Log Fold Change

Gene significance and Log Fold Chance are two different ways to measure the same thing, so there should be a concordance between them

Log Fold Change seems to have a wider range of values and Gene Significance values seem more uniformly distributed, but they do agree with each other

plot_data = dataset %>% dplyr::select(ID, MTcor, GS) %>% left_join(genes_info %>% dplyr::select(ID, gene.score), by='ID') %>%
            left_join(genes_info %>% dplyr::select(ID, baseMean, log2FoldChange, significant, Module), by='ID') %>%
            left_join(data.frame(MTcor=unique(dataset$MTcor)) %>% arrange(by=MTcor) %>% 
                                 mutate(order=1:length(unique(dataset$MTcor))), by='MTcor')

ggplotly(plot_data %>% ggplot(aes(GS, log2FoldChange)) + geom_point(color=plot_data$Module, alpha=0.5, aes(ID=Module)) + 
         geom_smooth(color='gray') + theme_minimal() + xlab('Gene Significance') + 
         ggtitle(paste0('Correlation = ', round(cor(plot_data$log2FoldChange, plot_data$GS)[1], 4))))


2. Module-Diagnosis correlation vs Gene Significance

In general, modules with the highest Module-Diagnosis correlation should have genes with high Gene Significance

Note: For the Module-Diagnosis plots, if you do boxplots, you lose the exact module-diagnosis correlation and you only keep the order, so I decided to compensate this downside with a second plot, where each point is plotted individually using their module’s Module-Diagnosis correlation as the x axis. I think the boxplot plot is easier to understand but the second plot contains more information, so I don’t know which one is better.

plot_data = plot_data %>% arrange(order)

ggplotly(plot_data %>% ggplot(aes(order, GS, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Gene Significance'))
plot_data %>% ggplot(aes(MTcor, GS)) + geom_hline(yintercept=0, color='gray', linetype='dotted') + 
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('Gene Significance') + 
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$GS)^2,4)))

3. Module-Diagnosis correlation vs Log Fold Change

The same should happen with the Log Fold Change

ggplotly(plot_data %>% ggplot(aes(order, log2FoldChange, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + 
         theme_minimal() + xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2FoldChange'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2FoldChange)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', alpha=0.3) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('log2FoldChange') + 
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$log2FoldChange)^2,4))))


4. Module-Diagnosis vs Mean Expression

When studying this plot using samples from all brain regions, we can see a small delation between module-Diagnosis and mean expression that we could explain by what we had observed where overexpressed genes tended to have lower levels of expression than the overexpressed genes, but this patterns is no longer recognisable on this plot.

ggplotly(plot_data %>% ggplot(aes(order, log2(baseMean+1), group=order)) + 
         geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2(Mean Expression)'))
plot_data %>% ggplot(aes(MTcor, log2(baseMean+1))) + geom_point(alpha=0.2, color=plot_data$Module, aes(id=ID)) + 
         geom_hline(yintercept=mean(log2(plot_data$baseMean+1)), color='gray', linetype='dotted') + 
         geom_smooth(color='gray', alpha=0.3) + theme_minimal() + xlab('Module-Diagnosis correlation') +
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, log2(plot_data$baseMean+1))^2,4)))

Conclusion:

All of the variables seem to agree with each other, Modules with a high correlation with Diagnosis tend to have genes with high values of Log Fold Change as well as high values of Gene Significance, and the gray module, which groups all the genes that weren’t assigned to any cluster tends to have a very poor performance in all of the metrics.



SFARI Scores


Since SFARI scores genes depending on the strength of the evidence linking it to the development of autism, in theory, there should be some concordance between the metrics we have been studying above and these scores…


SFARI Scores vs Gene Significance


  • There is a big difference between this plot and the one that considers all the brain regions:

    • SFARI score 6 now has the lowest distribution than the other scores when before it was the highest

    • SFARI score 1 still has a lower median than the other scores, but not as distinctive as before

    • SFARI scores 2 to 5 have a higher median than genes with a neuronal-related annotation

  • In general, this plot shows more concordance between the experimental results and the SFARI genes than the one considering all of the samples

ggplotly(plot_data %>% ggplot(aes(gene.score, abs(GS), fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() + 
         ylab('abs(Gene Significance)') + xlab('SFARI Scores') + theme(legend.position='none'))

SFARI Scores vs Module-Diagnosis correlation

  • This plot got inverted completely:

    • The higher the SFARI score, the higher the Module-Trait correlation

    • SFARI scores 1 and 2 have significantly higher values of Module-Trait correlation than the rest of the groups

  • The group with the lowest Module-Diagnosis correlation is SFARI score 6, which is supposed to be the one with the least amount of evidence suggesting a relation to autism

ggplotly(plot_data %>% ggplot(aes(gene.score, abs(MTcor), fill=gene.score)) + geom_boxplot() + 
         scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() + 
         ylab('abs(Module-Diagnosis Correlation)') + xlab('SFARI Scores') + theme(legend.position='none'))

Conclusion:

This time, it seems that the SFARI gene scoring is relatively consistent with the other measurements. These conclusions are opposite to the ones obtained when studying samples from all the brain regions.



Session info

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.15     iterators_1.0.12      foreach_1.4.7        
##  [4] polycor_0.7-10        expss_0.10.1          WGCNA_1.68           
##  [7] fastcluster_1.1.25    dynamicTreeCut_1.63-1 GGally_1.4.0         
## [10] gridExtra_2.3         viridis_0.5.1         viridisLite_0.3.0    
## [13] RColorBrewer_1.1-2    dendextend_1.13.3     plotly_4.9.2         
## [16] glue_1.3.1            reshape2_1.4.3        forcats_0.4.0        
## [19] stringr_1.4.0         dplyr_0.8.3           purrr_0.3.3          
## [22] readr_1.3.1           tidyr_1.0.2           tibble_2.1.3         
## [25] ggplot2_3.2.1         tidyverse_1.3.0      
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.2-0                 plyr_1.8.5                 
##   [5] lazyeval_0.2.2              splines_3.6.0              
##   [7] BiocParallel_1.20.1         crosstalk_1.0.0            
##   [9] GenomeInfoDb_1.22.0         robust_0.4-18.2            
##  [11] digest_0.6.24               htmltools_0.4.0            
##  [13] GO.db_3.10.0                fansi_0.4.1                
##  [15] magrittr_1.5                checkmate_1.9.4            
##  [17] memoise_1.1.0               fit.models_0.5-14          
##  [19] cluster_2.0.8               annotate_1.64.0            
##  [21] modelr_0.1.5                matrixStats_0.55.0         
##  [23] colorspace_1.4-1            blob_1.2.1                 
##  [25] rvest_0.3.5                 rrcov_1.4-7                
##  [27] haven_2.2.0                 xfun_0.8                   
##  [29] crayon_1.3.4                RCurl_1.95-4.12            
##  [31] jsonlite_1.6                genefilter_1.68.0          
##  [33] impute_1.60.0               survival_2.44-1.1          
##  [35] gtable_0.3.0                zlibbioc_1.32.0            
##  [37] XVector_0.26.0              DelayedArray_0.12.2        
##  [39] BiocGenerics_0.32.0         DEoptimR_1.0-8             
##  [41] scales_1.1.0                mvtnorm_1.0-11             
##  [43] DBI_1.1.0                   Rcpp_1.0.3                 
##  [45] xtable_1.8-4                htmlTable_1.13.1           
##  [47] foreign_0.8-71              bit_1.1-15.2               
##  [49] preprocessCore_1.48.0       Formula_1.2-3              
##  [51] stats4_3.6.0                htmlwidgets_1.5.1          
##  [53] httr_1.4.1                  ellipsis_0.3.0             
##  [55] acepack_1.4.1               pkgconfig_2.0.3            
##  [57] reshape_0.8.8               XML_3.99-0.3               
##  [59] farver_2.0.3                nnet_7.3-12                
##  [61] dbplyr_1.4.2                locfit_1.5-9.1             
##  [63] later_1.0.0                 tidyselect_0.2.5           
##  [65] labeling_0.3                rlang_0.4.4                
##  [67] AnnotationDbi_1.48.0        munsell_0.5.0              
##  [69] cellranger_1.1.0            tools_3.6.0                
##  [71] cli_2.0.1                   generics_0.0.2             
##  [73] RSQLite_2.2.0               broom_0.5.4                
##  [75] fastmap_1.0.1               evaluate_0.14              
##  [77] yaml_2.2.0                  knitr_1.24                 
##  [79] bit64_0.9-7                 fs_1.3.1                   
##  [81] robustbase_0.93-5           nlme_3.1-139               
##  [83] mime_0.9                    xml2_1.2.2                 
##  [85] compiler_3.6.0              rstudioapi_0.10            
##  [87] reprex_0.3.0                geneplotter_1.64.0         
##  [89] pcaPP_1.9-73                stringi_1.4.6              
##  [91] lattice_0.20-38             Matrix_1.2-17              
##  [93] vctrs_0.2.2                 pillar_1.4.3               
##  [95] lifecycle_0.1.0             data.table_1.12.8          
##  [97] bitops_1.0-6                httpuv_1.5.2               
##  [99] GenomicRanges_1.38.0        R6_2.4.1                   
## [101] latticeExtra_0.6-28         promises_1.1.0             
## [103] IRanges_2.20.2              codetools_0.2-16           
## [105] MASS_7.3-51.4               assertthat_0.2.1           
## [107] SummarizedExperiment_1.16.1 DESeq2_1.26.0              
## [109] withr_2.1.2                 S4Vectors_0.24.3           
## [111] GenomeInfoDbData_1.2.2      mgcv_1.8-28                
## [113] hms_0.5.3                   grid_3.6.0                 
## [115] rpart_4.1-15                rmarkdown_1.14             
## [117] Cairo_1.5-10                Biobase_2.46.0             
## [119] shiny_1.4.0                 lubridate_1.7.4            
## [121] base64enc_0.1-3